
library(dplyr)
library(ggplot2)
library(scico)
library(ggtext)

# Snapshot online data ----------------------------------------------------

# Load the snapshot extraction results. The object is named `extractData`
# because every later step in this script refers to it by that name.
extractData <- read.csv(file = "./Data/Snapshot Online Data.csv",
                        stringsAsFactors = FALSE)

# Headline counts: number of distinct values in the `sp` column and in the
# `keyw` column across the whole snapshot.
extractData %>% 
  summarise(amphiSpecies = length(unique(sp)),
            keyWords = length(unique(keyw)))

# Per-website metadata, keyed by `webID`.
websiteData <- read.csv(file = "./Data/Target Websites Censored.csv",
                        stringsAsFactors = FALSE)

# Attach the website metadata to every snapshot record; records without a
# matching `webID` are kept (left join) with NA in the added columns.
extractDataAug <- left_join(extractData, websiteData, by = "webID")

# Number of species per language that were detected in that language only.
uniquePerLang <- extractDataAug %>% 
  ## one row per language/species combination
  distinct(lang, sp) %>% 
  ## keep species that occur in exactly one language; a species with more
  ## than one row here was detected in more than one language
  group_by(sp) %>% 
  filter(n() == 1) %>% 
  ## count those single-language species within each language
  group_by(lang) %>% 
  count(name = "nunispp")

# Per-language summary:
#   nspp    - number of distinct species detected in that language
#   nweb    - number of distinct websites (`webID`) in that language
#   nunispp - number of species detected in that language only
langSummary <- extractDataAug %>% 
  group_by(lang) %>% 
  summarise(nspp = length(unique(sp)),
            nweb = length(unique(webID))) %>% 
  ## explicit key: avoids relying on dplyr guessing the shared column
  left_join(uniquePerLang, by = "lang") %>% 
  ## a language with no single-language species has no row in uniquePerLang,
  ## so the join yields NA; that case is a count of zero
  mutate(nunispp = coalesce(nunispp, 0L))


# Figure: species and website counts per language --------------------------
#
# Languages are ordered along the x axis by decreasing `nspp`. For each
# language the figure shows:
#   * a vertical line from 0 to `nweb`, nudged left, capped by a circle
#     containing the `nweb` value;
#   * a bar of height `nspp` overlaid with a bar of height `nunispp`;
#   * a short horizontal leader line at the top of each bar, with the count
#     just below it and a percentage just above it
#     (`percentoftot`: nspp as % of all distinct species in `extractData`;
#      `percentuni`: nunispp as % of that language's nspp);
#   * a faint dashed vertical separator to the right of each language.
#
# `xpos` is the numeric axis position of each language (1 = first on the
# axis). Separators and leader lines are placed relative to it, so they
# follow the data instead of assuming a fixed number of languages.
langSummary %>% 
  mutate(lang = case_when(
    lang == "ENG" ~ "English",
    lang == "FRA" ~ "French",
    lang == "GER" ~ "German",
    lang == "JPN" ~ "Japanese",
    lang == "POR" ~ "Portuguese",
    lang == "SPA" ~ "Spanish"
  ),
  percentuni = round(nunispp / nspp *100, digits = 1),
  percentoftot = round(nspp / length(unique(extractData$sp)) *100, digits = 1),
  ## same ordering expression as the x aesthetic below, so positions agree
  xpos = as.numeric(reorder(lang, -nspp))
  ) %>% 
  ggplot(aes(x = reorder(lang, -nspp))) +
  ## number of websites: vertical line from zero, nudged left of the bars
  ## NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for line geoms in
  ## favour of `linewidth`; kept as `size` for older installations - confirm
  ## the ggplot2 version in use before switching.
  geom_linerange(aes(ymin = 0, ymax = nweb),
                 position = position_nudge(x = -0.25),
                 size = 1.05) +
  ## dashed separator halfway between this language and the next position
  geom_linerange(aes(x = xpos + 0.5,
                     ymin = -Inf, ymax = Inf),
                 linetype = 2, alpha = 0.25) +
  ## circle and label carrying the website count on top of the line
  geom_point(aes(y = nweb),
             position = position_nudge(x = -0.25, y = 6),
             pch = 21, size = 4, stroke = 1.25) +
  geom_text(aes(y = nweb, label = nweb),
            position = position_nudge(x = -0.25, y = 6),
            vjust = 0.5, fontface = 2, size = 2.25) +
  ## all species per language, then single-language species drawn over it
  geom_col(aes(y = nspp,
               fill = "nspp"),
           width = 0.2,
           alpha = 1) +
  geom_col(aes(y = nunispp,
               fill = "nunispp"),
           width = 0.2) +
  ## count (below the leader line) and percentage (above it) for each bar
  geom_text(aes(y = nunispp, label = nunispp, colour = "nunispp"),
            position = position_nudge(x = 0.125, y = -5),
            hjust = 0, vjust = 0.5, fontface = 2, size = 1.75) +
  geom_text(aes(y = nunispp, label = paste0(percentuni, "%"), colour = "nunispp"),
            position = position_nudge(x = 0.125, y = 5),
            hjust = 0, vjust = 0.5, fontface = 4, size = 1.75) +
  geom_text(aes(y = nspp, label = nspp, colour = "nspp"),
            position = position_nudge(x = 0.125, y = -5),
            hjust = 0, vjust = 0.5, fontface = 2, size = 1.75) +
  geom_text(aes(y = nspp, label = paste0(percentoftot, "%"), colour = "nspp"),
            position = position_nudge(x = 0.125, y = 5),
            hjust = 0, vjust = 0.5, fontface = 4, size = 1.75) +
  ## horizontal leader lines at bar height, running from the bar to the labels
  geom_segment(aes(x = xpos - 0.1,
                   xend = xpos + 0.35,
                   y = nspp,
                   yend = nspp,
                   colour = "nspp"),
               alpha = 1) +
  geom_segment(aes(x = xpos - 0.1,
                   xend = xpos + 0.35,
                   y = nunispp,
                   yend = nunispp,
                   colour = "nunispp"),
               alpha = 1) +
  ## identical palette settings for colour and fill so text, leader lines
  ## and bars of the same series match
  scale_colour_scico_d(palette = "roma", begin = 0.9, end = 0.65,
                       direction = -1) +
  scale_fill_scico_d(palette = "roma", begin = 0.9, end = 0.65,
                     direction = -1) +
  labs(x = "Language", y = "# of\nspecies") +
  scale_y_continuous(breaks = seq(0,300,100)) +
  theme_bw() +
  theme(axis.title = element_text(face = 2, hjust = 1),
        axis.title.y = element_text(hjust = 1, vjust = 1, angle = 0,
                                    margin = margin(5,5,5,0)),
        axis.title.x = element_text(margin = margin(5,0,5,0)),
        axis.title.x.top = element_text(margin = margin(5,0,5,0)),
        legend.title = element_text(face = 2),
        legend.position = "none",
        strip.background = element_blank(),
        strip.text = element_text(face = 4, hjust = 0),
        panel.background = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(),
        axis.ticks.x = element_blank(),
        panel.grid = element_blank())

# Write the figure built above (ggsave's default plot is the last plot
# created) as a 160 x 100 mm PNG and PDF.
ggsave("./Figures/Species by language.png", width = 160, height = 100,
       dpi = 300, units = "mm")
ggsave("./Figures/Species by language.pdf", width = 160, height = 100,
       units = "mm")


